###############################################################################-
# Created By: Pietryka
# Creation Date:  2017-03-15
# purpose: create corpora
# Contact: mpietryka@fsu.edu
###############################################################################-

# 1. LOAD DATA AND PACKAGES   ----------------



## 1A. LOAD PACKAGES  ------------

library(tidyverse)   # DATA CLEANING FUNCTIONS
library(stringr)     # STRING FUNCTIONS
library(textreuse)    # TEXT ANALYSIS FUNCTIONS


## 1B. LOAD DATA ----------------


# This data file contains the original documents that we obtained from the
# NBER/Maryland State Constitutions Project
# (http://www.stateconstitutions.umd.edu/index.aspx). We then prepared the text
# following standard text-analysis procedures (e.g., Silge and Robinson 2017),
# transforming all text to lowercase, removing punctuation, removing stop words
# defined by the SMART information retrieval system, and then stemming the
# remaining words using the Snowball algorithm (http://snowballstem.org/).

# Read the prepared document-level data (one row per document, wide format).
doc_df <- read_rds("../Data/Derived/doc_df.rds")

## 1C. MOVE FROM WIDE TO LONG DATA --------------

# Stack every "*_text" and "*_nostop" column into key/value pairs: the name of
# the source column lands in `topic` and its contents land in `text`.
long_df <- gather(
  doc_df,
  key = topic,
  value = text,
  ends_with("_text"), ends_with("_nostop")
)

## 1D. DEFINE FUNCTIONS ----------------

# FUNCTION TO CREATE CORPUS

create_corpus <- function(df, ids = document_id, ...) {
  # Build a TextReuseCorpus of word n-grams from the `text` column of `df`.
  #
  # Args:
  #   df:  data frame holding a `text` column plus the ID column.
  #   ids: unquoted name of the column whose values become the document IDs
  #        (defaults to `document_id`).
  #   ...: further arguments handed on to TextReuseCorpus() (the script uses
  #        this to set the n-gram size, e.g. `n = 3`).
  #
  # Returns: the object produced by TextReuseCorpus(), with tokens retained.

  # Capture `ids` unevaluated so it can be used as a column selector.
  id_quo <- enquo(ids)
  doc_ids <- pull(select(df, !!id_quo))

  # Label each document's text with its ID.
  named_text <- setNames(df$text, doc_ids)

  TextReuseCorpus(
    text = named_text,
    tokenizer = tokenize_ngrams,
    lowercase = TRUE,
    keep_tokens = TRUE, # keep the tokens so they can be extracted afterwards
    ...
  )
}


# FUNCTION TO CREATE CORPUS BASED ON SKIP GRAMS

skip_corpus <- function(df, ids = document_id, n = 5, k = 1, ...) {
  # Build a TextReuseCorpus tokenized with skip n-grams from `df$text`.
  #
  # Args:
  #   df:  data frame holding a `text` column plus the ID column.
  #   ids: unquoted name of the column whose values become the document IDs
  #        (defaults to `document_id`).
  #   n:   n-gram size given to the tokenizer. Defaults to 5, the value this
  #        function previously hard-coded.
  #   k:   skip setting given to the tokenizer. Defaults to 1, the value this
  #        function previously hard-coded.
  #   ...: further arguments handed on to TextReuseCorpus(), mirroring
  #        create_corpus() (e.g. `keep_tokens = TRUE`).
  #
  # Returns: the object produced by TextReuseCorpus().
  #
  # Calling skip_corpus(df) with no extra arguments behaves exactly as before.

  # Capture `ids` unevaluated so it can be used as a column selector.
  ids <- enquo(ids)
  ids <- df %>% select(!!ids) %>% pull()

  # Label each document's text with its ID.
  the_text <- df$text
  names(the_text) <- ids

  # `lowercase`, `n` and `k` are not TextReuseCorpus() arguments of their own;
  # presumably they are forwarded to the tokenizer -- see the textreuse docs.
  TextReuseCorpus(
    text = the_text,
    tokenizer = tokenize_skip_ngrams,
    lowercase = TRUE,
    n = n,
    k = k,
    ...
  )
}

# FUNCTION TO EXTRACT N-GRAMS
extract_ngrams <- function(corpus) {
  # Turn a corpus into a long table with one row per (document, n-gram).
  #
  # Args:
  #   corpus: object accepted by tokens(); in this script, a corpus built by
  #           create_corpus(). Presumably tokens() yields one token vector per
  #           document, named by document ID -- confirm in the textreuse docs.
  #
  # Returns: a tibble with columns `document_id` (integer), `ngram`, and
  #   `order_in_corpus`. Despite its name, `order_in_corpus` restarts at 1 for
  #   every `document_id`. The result is still grouped by `document_id`.
  corpus %>%
    tokens() %>%
    enframe(name = "document_id", value = "ngram") %>%
    # IDs arrive as names (character); non-numeric IDs would become NA here.
    mutate(document_id = as.integer(document_id)) %>%
    # Name the list column explicitly: a bare unnest() is deprecated and warns
    # in tidyr >= 1.0. Passing it positionally works in old and new tidyr.
    unnest(ngram) %>%
    arrange(document_id) %>%
    group_by(document_id) %>%
    mutate(order_in_corpus = row_number())
}




# 2. NESTED N-GRAM-LEVEL DATA  --------------------------------
##  DATA BY TOPIC W/ NESTED COLUMNS HOLDING CORPUS AND NGRAMS

# One row per topic. `data` holds that topic's documents; the `corpus_*`
# columns hold the corpora built from them, and the `*gram` columns hold the
# n-gram tables extracted from the matching corpus.
corpus_df <- long_df %>%
  group_by(topic) %>%
  nest() %>%
  mutate(
    # Corpora for each n-gram size, plus the skip-gram variant
    corpus_threegram = map(data, create_corpus, n = 3),
    corpus_fivegram = map(data, create_corpus, n = 5),
    corpus_sevengram = map(data, create_corpus, n = 7),
    corpus_skipfive = map(data, skip_corpus),
    # Long n-gram tables derived from the corpora created just above
    threegram = map(corpus_threegram, extract_ngrams),
    fivegram = map(corpus_fivegram, extract_ngrams),
    sevengram = map(corpus_sevengram, extract_ngrams)
  ) %>%
  ungroup()

# NOTE: expect warnings for irrelevant documents ("Skipping document with ID '[a
# document id]' because it has too few words to create at least two n-grams with
# ...")

# 3. SAVE ------------------

# Record which script produced this object as an attribute on the data itself.
attr(corpus_df, "source") <- "Created in 'DataClean/SC-2- Extract Corpora.R'"

# Pass the destination positionally: the argument was called `path` in older
# readr releases and `file` from readr 1.4 onward (where `path =` triggers a
# deprecation warning), so the positional form works in both.
write_rds(corpus_df, "../Data/Derived/corpus_df.rds")


# DISPLAY VERSION NUMBERS FOR R & PACKAGES IN USE
sessionInfo()

